Read the training and test sets from Kaggle
# Load the two Titanic CSV files from the working directory; character
# columns are read as factors in both data frames.
train <- read.csv(file = "TitanicTrain.csv", stringsAsFactors = TRUE)
test <- read.csv(file = "TitanicTest.csv", stringsAsFactors = TRUE)

# Show the column types and first values of the training data.
str(train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Gender : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Remove the Name, Ticket and Cabin columns from both data frames
# (str(train) reports them as factors with 891, 681 and 148 levels).
train[["Name"]] <- NULL
test[["Name"]] <- NULL
train[["Ticket"]] <- NULL
test[["Ticket"]] <- NULL
train[["Cabin"]] <- NULL
test[["Cabin"]] <- NULL

# Family is the sum of the SibSp and Parch columns, added to both frames.
train[["Family"]] <- train[["SibSp"]] + train[["Parch"]]
test[["Family"]] <- test[["SibSp"]] + test[["Parch"]]

# Survived is converted from integer to factor in the training frame only.
train[["Survived"]] <- as.factor(train[["Survived"]])

# Inspect Pclass while it is still an integer column.
str(train[["Pclass"]])
table(train[["Pclass"]])
summary(train[["Pclass"]])
## int [1:891] 3 1 3 1 3 3 1 3 3 2 ...
##
## 1 2 3
## 216 184 491
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 2.309 3.000 3.000
# Recode the integer class codes 1/2/3 as a factor labelled P1/P2/P3.
train[["Pclass"]] <- factor(
  train[["Pclass"]],
  levels = c(1, 2, 3),
  labels = c("P1", "P2", "P3")
)

# Inspect Pclass again now that it is a factor.
str(train[["Pclass"]])
table(train[["Pclass"]])
summary(train[["Pclass"]])
## Factor w/ 3 levels "P1","P2","P3": 3 1 3 1 3 3 1 3 3 2 ...
##
## P1 P2 P3
## 216 184 491
## P1 P2 P3
## 216 184 491
# Recode Pclass in the test frame as a factor with levels 1/2/3 labelled
# P1/P2/P3.
test[["Pclass"]] <- factor(
  test[["Pclass"]],
  levels = c(1, 2, 3),
  labels = c("P1", "P2", "P3")
)